In [ ]:
import pandas as pd
import numpy as np
In [ ]:
pd.read_csv("tdata.csv",nrows = 10)
Out[ ]:
Trip ID Trip Start Timestamp Trip End Timestamp Trip Seconds Trip Miles Pickup Census Tract Dropoff Census Tract Pickup Community Area Dropoff Community Area Fare ... Additional Charges Trip Total Shared Trip Authorized Trips Pooled Pickup Centroid Latitude Pickup Centroid Longitude Pickup Centroid Location Dropoff Centroid Latitude Dropoff Centroid Longitude Dropoff Centroid Location
0 c4a636bddc6b0cc53d20b2cc5635ff5f0b4a8141 01/01/2020 12:00:00 AM 01/01/2020 12:00:00 AM 721 3.8 NaN 1.703198e+10 NaN 76.0 15.0 ... 7.85 27.85 False 1 NaN NaN NaN 41.979071 -87.903040 POINT (-87.9030396611 41.9790708201)
1 000f3eb1fc6020bfb8a7daf0441589d314aac546 01/01/2020 12:00:00 AM 01/01/2020 01:45:00 AM 6399 89.0 NaN 1.703123e+10 NaN 23.0 95.0 ... 3.35 98.35 False 1 NaN NaN NaN 41.899062 -87.721316 POINT (-87.7213158985 41.8990616211)
2 8af67df7ca52185b15dc4fa29f2d972448fda731 01/01/2020 12:00:00 AM 01/01/2020 12:15:00 AM 1027 8.3 NaN NaN 15.0 77.0 12.5 ... 2.55 15.05 False 1 41.954028 -87.763399 POINT (-87.7633990316 41.9540276487) 41.986712 -87.663416 POINT (-87.6634164054 41.9867117999)
3 8be05cd4d7bf1997d306fc21dd4c37b9f9413558 01/01/2020 12:00:00 AM 01/01/2020 12:00:00 AM 384 2.0 NaN NaN 28.0 28.0 5.0 ... 2.55 10.55 False 1 41.874005 -87.663518 POINT (-87.6635175498 41.874005383) 41.874005 -87.663518 POINT (-87.6635175498 41.874005383)
4 8e7e77c3dd22422740402143aa88912521922ee2 01/01/2020 12:00:00 AM 01/01/2020 12:15:00 AM 553 2.4 NaN NaN 9.0 10.0 7.5 ... 2.55 13.05 False 1 42.007613 -87.813781 POINT (-87.8137810343 42.0076125931) 41.985015 -87.804532 POINT (-87.8045320063 41.9850151008)
5 003454de9b0cb5a3d5a1723d40438f07b574749d 01/01/2020 12:00:00 AM 01/01/2020 12:15:00 AM 884 4.4 1.703101e+10 1.703104e+10 1.0 4.0 10.0 ... 2.55 13.55 False 1 42.015934 -87.666536 POINT (-87.6665362779 42.0159343756) 41.972563 -87.678846 POINT (-87.6788459662 41.9725625375)
6 9343d1e5f13822780e7f8df61463401b90a576ab 01/01/2020 12:00:00 AM 01/01/2020 12:30:00 AM 1134 4.7 NaN NaN 30.0 30.0 10.0 ... 2.55 12.55 False 1 41.839087 -87.714004 POINT (-87.714003807 41.8390869059) 41.839087 -87.714004 POINT (-87.714003807 41.8390869059)
7 94314539302831a4a82eb35789f22644b006201b 01/01/2020 12:00:00 AM 01/01/2020 12:45:00 AM 2559 18.9 NaN NaN 20.0 43.0 17.5 ... 2.55 20.05 True 2 41.924347 -87.734740 POINT (-87.7347397536 41.9243470769) 41.761578 -87.572782 POINT (-87.5727819867 41.7615779081)
8 005827e8adc1c35df898021c18fb9fd3e4f1f5cb 01/01/2020 12:00:00 AM 01/01/2020 12:15:00 AM 977 12.4 NaN NaN 32.0 NaN 17.5 ... 2.55 20.05 False 1 41.878866 -87.625192 POINT (-87.6251921424 41.8788655841) NaN NaN NaN
9 005d727d45694054ff3502870af8826c255941a5 01/01/2020 12:00:00 AM 01/01/2020 12:15:00 AM 1205 11.6 NaN NaN 69.0 27.0 17.5 ... 2.55 20.05 False 1 41.763247 -87.616134 POINT (-87.6161341112 41.7632467988) 41.878914 -87.705897 POINT (-87.7058971305 41.8789144956)

10 rows × 21 columns

In [ ]:
pd.read_csv("tdata.csv",nrows = 10).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Trip ID                     10 non-null     object 
 1   Trip Start Timestamp        10 non-null     object 
 2   Trip End Timestamp          10 non-null     object 
 3   Trip Seconds                10 non-null     int64  
 4   Trip Miles                  10 non-null     float64
 5   Pickup Census Tract         1 non-null      float64
 6   Dropoff Census Tract        3 non-null      float64
 7   Pickup Community Area       8 non-null      float64
 8   Dropoff Community Area      9 non-null      float64
 9   Fare                        10 non-null     float64
 10  Tip                         10 non-null     int64  
 11  Additional Charges          10 non-null     float64
 12  Trip Total                  10 non-null     float64
 13  Shared Trip Authorized      10 non-null     bool   
 14  Trips Pooled                10 non-null     int64  
 15  Pickup Centroid Latitude    8 non-null      float64
 16  Pickup Centroid Longitude   8 non-null      float64
 17  Pickup Centroid Location    8 non-null      object 
 18  Dropoff Centroid Latitude   9 non-null      float64
 19  Dropoff Centroid Longitude  9 non-null      float64
 20  Dropoff Centroid Location   9 non-null      object 
dtypes: bool(1), float64(12), int64(3), object(5)
memory usage: 1.7+ KB
In [ ]:
year_data = pd.read_csv("tdata.csv", usecols={"Pickup Census Tract","Trip Miles","Trip Seconds"})
In [ ]:
year_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49871385 entries, 0 to 49871384
Data columns (total 3 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Trip Seconds         float64
 1   Trip Miles           float64
 2   Pickup Census Tract  float64
dtypes: float64(3)
memory usage: 1.1 GB
In [ ]:
year_data.head(10)
Out[ ]:
Trip Seconds Trip Miles Pickup Census Tract
0 721.0 3.8 NaN
1 6399.0 89.0 NaN
2 1027.0 8.3 NaN
3 384.0 2.0 NaN
4 553.0 2.4 NaN
5 884.0 4.4 1.703101e+10
6 1134.0 4.7 NaN
7 2559.0 18.9 NaN
8 977.0 12.4 NaN
9 1205.0 11.6 NaN
In [ ]:
year_data = year_data.rename(columns={"Trip Seconds":"Trip_Seconds","Trip Miles":"Trip_Miles","Pickup Census Tract":"GEOID"})
In [ ]:
year_data = year_data.dropna()
In [ ]:
year_data
Out[ ]:
Trip_Seconds Trip_Miles GEOID
5 884.0 4.4 1.703101e+10
10 759.0 4.0 1.703183e+10
11 248.0 1.3 1.703106e+10
12 276.0 1.1 1.703183e+10
14 513.0 2.8 1.703103e+10
... ... ... ...
49871368 1258.0 6.5 1.703103e+10
49871369 919.0 4.6 1.703183e+10
49871370 454.0 1.6 1.703107e+10
49871382 575.0 2.2 1.703108e+10
49871384 788.0 3.7 1.703107e+10

25487623 rows × 3 columns

In [ ]:
year_data = year_data.astype({"GEOID":int})
year_data.head()
Out[ ]:
Trip_Seconds Trip_Miles GEOID
5 884.0 4.4 17031010300
10 759.0 4.0 17031832600
11 248.0 1.3 17031063302
12 276.0 1.1 17031832200
14 513.0 2.8 17031030701
In [ ]:
year_data = year_data[["GEOID","Trip_Miles","Trip_Seconds"]]
year_data.head()
Out[ ]:
GEOID Trip_Miles Trip_Seconds
5 17031010300 4.4 884.0
10 17031832600 4.0 759.0
11 17031063302 1.3 248.0
12 17031832200 1.1 276.0
14 17031030701 2.8 513.0
In [ ]:
year_data.to_csv("geoid-miles-seconds.csv")
In [ ]:
year_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 25487623 entries, 5 to 49871384
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   GEOID         int64  
 1   Trip_Miles    float64
 2   Trip_Seconds  float64
dtypes: float64(2), int64(1)
memory usage: 777.8 MB
In [ ]:
trip_info = year_data.groupby("GEOID").mean().reset_index()
In [ ]:
trip_info
Out[ ]:
GEOID Trip_Miles Trip_Seconds
0 17031010100 7.179774 1180.483869
1 17031010201 6.450781 1080.175410
2 17031010202 4.742800 882.345516
3 17031010300 6.869025 1144.760704
4 17031010400 7.030327 1185.853686
... ... ... ...
1135 17031843700 5.527061 1057.733495
1136 17031843800 7.151539 1058.850033
1137 17031843900 7.479913 1096.066109
1138 17031980000 19.692362 1904.696142
1139 17031980100 16.828538 1884.554266

1140 rows × 3 columns

In [ ]:
trip_info.to_csv("trip-info.csv")
In [ ]:
pickups = year_data["GEOID"].value_counts().reset_index()
In [ ]:
pickups.columns = ["GEOID","Pickups"]
In [ ]:
pickups
Out[ ]:
GEOID Pickups
0 17031839100 1163394
1 17031980000 1046728
2 17031320100 746423
3 17031833000 739580
4 17031081700 712078
... ... ...
1135 17031826902 1
1136 17031822000 1
1137 17031801608 1
1138 17031823605 1
1139 17031823903 1

1140 rows × 2 columns

In [ ]:
import folium
import geopandas as gpd

geodata = gpd.read_file("shapes.geojson")

geodata = geodata[geodata["GEOID"].notna()]
geodata = geodata.astype({"GEOID":"int"}) 

geodata = geodata[geodata["GEOID"].isin(pickups["GEOID"])]
geodata = geodata.to_crs(epsg = 4326)
In [ ]:
geodata.head()
Out[ ]:
STATEFP COUNTYFP TRACTCE AFFGEOID GEOID NAME LSAD ALAND AWATER geometry
0 17 031 843800 1400000US17031843800 17031843800 8438 CT 1309516 0 POLYGON ((-87.64554 41.80886, -87.64068 41.808...
2 17 031 243000 1400000US17031243000 17031243000 2430 CT 324548 0 POLYGON ((-87.68195 41.89583, -87.67950 41.895...
3 17 031 250600 1400000US17031250600 17031250600 2506 CT 647765 0 POLYGON ((-87.77560 41.90925, -87.77536 41.909...
4 17 031 251700 1400000US17031251700 17031251700 2517 CT 486655 0 POLYGON ((-87.74826 41.89498, -87.74645 41.895...
5 17 031 260400 1400000US17031260400 17031260400 2604 CT 328225 0 POLYGON ((-87.74061 41.88781, -87.73571 41.887...
In [ ]:
pickup_map = folium.Map(location = [41.8,-87.6])
pickup_map
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
folium.Choropleth(
    geo_data=geodata,
    name="choropleth",
    data=pickups,
    columns=["GEOID","Pickups"],
    threshold_scale = [0,50000,100000,150000,200000,300000,500000,750000,1000000, 1200000],
    key_on="feature.properties.GEOID",
    fill_color="BuPu",
    fill_opacity=0.7,
    line_opacity=.1,
    legend_name="Pickups by Census Tract",
).add_to(pickup_map)

folium.LayerControl().add_to(pickup_map)


pickup_map
Out[ ]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
built_environment = pd.read_csv("built-environment-demographics.csv")
In [ ]:
trip_info = pd.merge(trip_info, pickups)

# main_df = pd.merge(built_environment, trip_info)
In [ ]:
main_df = pd.merge(trip_info, built_environment)
In [ ]:
main_df
Out[ ]:
GEOID Trip_Miles Trip_Seconds Pickups Unnamed: 0 Unnamed: 0.1 MedianIncome Pickup TotalPopulation Population_Density Employment_Density Percent_Zero_Car_Ownership LandUse_Diversity Distance_from_transit
0 17031010201 6.450781 1080.175410 28499 0 0 41125 1837.0 7039 59.119022 1.490412 0.019724 0.069242 234.2500
1 17031010202 4.742800 882.345516 23203 1 1 45236 1375.0 2852 36.278590 8.568341 0.027317 0.724295 193.7950
2 17031010300 6.869025 1144.760704 28471 2 2 72917 1627.0 6650 58.470922 9.582295 0.106673 0.346823 184.4025
3 17031010400 7.030327 1185.853686 32895 3 3 58438 1204.0 5153 64.711477 29.071885 0.023679 2.464812 152.4400
4 17031010501 7.247471 1185.797851 19174 4 4 52747 1310.0 4147 84.888695 3.081571 0.032108 0.066411 113.1000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
890 17031843500 8.583506 1297.040576 4929 890 918 28750 150.0 10317 23.140485 6.286981 0.085057 9.501695 537.7900
891 17031843600 5.911914 956.318086 22670 891 919 67609 915.0 2918 28.806947 3.053905 0.114072 0.230467 264.8700
892 17031843700 5.527061 1057.733495 46532 892 920 194375 1843.0 2617 14.264697 16.557479 0.100851 4.656542 283.6450
893 17031843800 7.151539 1058.850033 1527 893 921 41250 58.0 1482 4.633875 0.716691 0.092342 0.372437 274.9300
894 17031843900 7.479913 1096.066109 20451 894 922 50104 1619.0 3521 35.433393 2.294889 0.098458 0.380601 199.0220

895 rows × 14 columns

In [ ]:
main_df = main_df.drop(columns=["Unnamed: 0","Unnamed: 0.1"])
In [ ]:
main_df
Out[ ]:
GEOID Trip_Miles Trip_Seconds Pickups MedianIncome Pickup TotalPopulation Population_Density Employment_Density Percent_Zero_Car_Ownership LandUse_Diversity Distance_from_transit
0 17031010201 6.450781 1080.175410 28499 41125 1837.0 7039 59.119022 1.490412 0.019724 0.069242 234.2500
1 17031010202 4.742800 882.345516 23203 45236 1375.0 2852 36.278590 8.568341 0.027317 0.724295 193.7950
2 17031010300 6.869025 1144.760704 28471 72917 1627.0 6650 58.470922 9.582295 0.106673 0.346823 184.4025
3 17031010400 7.030327 1185.853686 32895 58438 1204.0 5153 64.711477 29.071885 0.023679 2.464812 152.4400
4 17031010501 7.247471 1185.797851 19174 52747 1310.0 4147 84.888695 3.081571 0.032108 0.066411 113.1000
... ... ... ... ... ... ... ... ... ... ... ... ...
890 17031843500 8.583506 1297.040576 4929 28750 150.0 10317 23.140485 6.286981 0.085057 9.501695 537.7900
891 17031843600 5.911914 956.318086 22670 67609 915.0 2918 28.806947 3.053905 0.114072 0.230467 264.8700
892 17031843700 5.527061 1057.733495 46532 194375 1843.0 2617 14.264697 16.557479 0.100851 4.656542 283.6450
893 17031843800 7.151539 1058.850033 1527 41250 58.0 1482 4.633875 0.716691 0.092342 0.372437 274.9300
894 17031843900 7.479913 1096.066109 20451 50104 1619.0 3521 35.433393 2.294889 0.098458 0.380601 199.0220

895 rows × 12 columns

In [ ]:
main_df.columns
Out[ ]:
Index(['GEOID', 'Trip_Miles', 'Trip_Seconds', 'Pickups', 'MedianIncome',
       'Pickup', 'TotalPopulation', 'Population_Density',
       ' Employment_Density', 'Percent_Zero_Car_Ownership',
       'LandUse_Diversity', 'Distance_from_transit'],
      dtype='object')
In [ ]:
main_df = main_df[['GEOID', 'Pickups','Trip_Miles', 'Trip_Seconds', 'MedianIncome',
        'TotalPopulation', 'Population_Density',
       ' Employment_Density', 'Percent_Zero_Car_Ownership',
       'LandUse_Diversity', 'Distance_from_transit']]
In [ ]:
import seaborn as sns

X = main_df[main_df.columns[1:12]]
In [ ]:
X
Out[ ]:
Pickups Trip_Miles Trip_Seconds MedianIncome TotalPopulation Population_Density Employment_Density Percent_Zero_Car_Ownership LandUse_Diversity Distance_from_transit
0 28499 6.450781 1080.175410 41125 7039 59.119022 1.490412 0.019724 0.069242 234.2500
1 23203 4.742800 882.345516 45236 2852 36.278590 8.568341 0.027317 0.724295 193.7950
2 28471 6.869025 1144.760704 72917 6650 58.470922 9.582295 0.106673 0.346823 184.4025
3 32895 7.030327 1185.853686 58438 5153 64.711477 29.071885 0.023679 2.464812 152.4400
4 19174 7.247471 1185.797851 52747 4147 84.888695 3.081571 0.032108 0.066411 113.1000
... ... ... ... ... ... ... ... ... ... ...
890 4929 8.583506 1297.040576 28750 10317 23.140485 6.286981 0.085057 9.501695 537.7900
891 22670 5.911914 956.318086 67609 2918 28.806947 3.053905 0.114072 0.230467 264.8700
892 46532 5.527061 1057.733495 194375 2617 14.264697 16.557479 0.100851 4.656542 283.6450
893 1527 7.151539 1058.850033 41250 1482 4.633875 0.716691 0.092342 0.372437 274.9300
894 20451 7.479913 1096.066109 50104 3521 35.433393 2.294889 0.098458 0.380601 199.0220

895 rows × 10 columns

In [ ]:
X = X.dropna()
X = X[(X != 0).all(1)]
In [ ]:
X.shape
Out[ ]:
(889, 10)
In [ ]:
sns.pairplot(X)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7fc848d21f40>
2022-05-01T15:19:52.091899 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
sns.heatmap(X.corr())
Out[ ]:
<AxesSubplot:>
2022-05-01T15:20:00.152275 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
X.describe()
Out[ ]:
Pickups Trip_Miles Trip_Seconds MedianIncome TotalPopulation Population_Density Employment_Density Percent_Zero_Car_Ownership LandUse_Diversity Distance_from_transit
count 8.890000e+02 889.000000 889.000000 889.000000 889.000000 889.000000 889.000000 889.000000 889.000000 889.000000
mean 2.698930e+04 6.896870 1098.841019 84220.039370 3679.753656 31.174132 11.469629 0.208842 1.535303 279.243646
std 8.115247e+04 2.642166 230.613680 54337.144703 1832.146235 27.338535 49.842658 0.156357 7.612212 112.575014
min 2.000000e+00 3.322565 688.884793 11457.000000 416.000000 2.139127 0.027023 0.002972 0.004566 23.470000
25% 1.532000e+03 5.378845 954.960748 44911.000000 2279.000000 15.517109 1.572308 0.076892 0.184376 211.900000
50% 5.553000e+03 6.527606 1065.800529 67000.000000 3485.000000 25.432728 3.351064 0.172012 0.401153 257.493333
75% 1.846000e+04 7.515382 1194.269912 107267.000000 4861.000000 38.537783 7.697686 0.304828 0.950401 327.900000
max 1.163394e+06 37.400000 2655.400000 250000.000000 19889.000000 407.584189 1102.545869 0.740122 176.285714 1062.170000
In [ ]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
2022-05-01T15:20:02.242253 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)
clusters = kmeans.predict(X)
In [ ]:
cluster_centers = pd.DataFrame(kmeans.cluster_centers_)
cluster_centers.columns = X.columns.values
cluster_centers
Out[ ]:
Pickups Trip_Miles Trip_Seconds MedianIncome TotalPopulation Population_Density Employment_Density Percent_Zero_Car_Ownership LandUse_Diversity Distance_from_transit
0 8348.449929 7.248445 1131.588034 61194.454161 3672.187588 27.764839 4.392629 0.207883 0.936670 285.514218
1 644577.600000 4.490997 805.811809 185783.900000 8360.400000 72.685729 297.063445 0.339215 14.438450 173.048917
2 68403.870588 5.572115 979.503715 174275.929412 3435.976471 42.951031 24.185244 0.205175 3.272948 259.338417
In [ ]:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram
from pandas.plotting import parallel_coordinates
In [ ]:
palette = sns.color_palette("bright", 10)
In [ ]:
def addAlpha(colour, alpha):
    '''Add an alpha to the RGB colour'''
    
    return (colour[0],colour[1],colour[2],alpha)
In [ ]:
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    '''Display a scatter plot on a factorial plane, one for each factorial plane'''

    # For each factorial plane
    for d1,d2 in axis_ranks:
        if d2 < n_comp:
 
            # Initialise the matplotlib figure      
            fig = plt.figure(figsize=(7,6))
        
            # Display the points
            if illustrative_var is None:
                plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
            else:
                illustrative_var = np.array(illustrative_var)
                for value in np.unique(illustrative_var):
                    selected = np.where(illustrative_var == value)
                    plt.scatter(X_projected[selected, d1], X_projected[selected, d2], alpha=alpha, label=value)
                plt.legend()

            # Display the labels on the points
            if labels is not None:
                for i,(x,y) in enumerate(X_projected[:,[d1,d2]]):
                    plt.text(x, y, labels[i],
                              fontsize='14', ha='center',va='center') 
                
            # Define the limits of the chart
            boundary = np.max(np.abs(X_projected[:, [d1,d2]])) * 1.1
            plt.xlim([-boundary,boundary])
            plt.ylim([-boundary,boundary])
        
            # Display grid lines
            plt.plot([-100, 100], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-100, 100], color='grey', ls='--')

            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Projection of points (on PC{} and PC{})".format(d1+1, d2+1))
            #plt.show(block=False)
In [ ]:
def display_parallel_coordinates(df, num_clusters):
    '''Display a parallel coordinates plot for the clusters in df'''

    # Select data points for individual clusters
    cluster_points = []
    for i in range(num_clusters):
        cluster_points.append(df[df.cluster==i])
    
    # Create the plot
    fig = plt.figure(figsize=(12, 15))
    title = fig.suptitle("Parallel Coordinates Plot for the Clusters", fontsize=18)
    fig.subplots_adjust(top=0.95, wspace=0)

    # Display one plot for each cluster, with the lines for the main cluster appearing over the lines for the other clusters
    for i in range(num_clusters):    
        plt.subplot(num_clusters, 1, i+1)
        for j,c in enumerate(cluster_points): 
            if i!= j:
                pc = parallel_coordinates(c, 'cluster', color=[addAlpha(palette[j],0.2)])
        pc = parallel_coordinates(cluster_points[i], 'cluster', color=[addAlpha(palette[i],0.5)])

        # Stagger the axes
        ax=plt.gca()
        for tick in ax.xaxis.get_major_ticks()[1::2]:
            tick.set_pad(20)        


def display_parallel_coordinates_centroids(df, num_clusters):
    '''Display a parallel coordinates plot for the centroids in df'''

    # Create the plot
    fig = plt.figure(figsize=(12, 5))
    title = fig.suptitle("Parallel Coordinates plot for the Centroids", fontsize=18)
    fig.subplots_adjust(top=0.9, wspace=0)

    # Draw the chart
    parallel_coordinates(df, 'cluster', color=palette)

    # Stagger the axes
    ax=plt.gca()
    for tick in ax.xaxis.get_major_ticks()[1::2]:
        tick.set_pad(20)    
In [ ]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc_X = sc.fit_transform(X)
sc_X = pd.DataFrame(sc_X)
In [ ]:
sc_X
Out[ ]:
0 1 2 3 4 5 6 7 8 9
0 0.018614 -0.168930 -0.080984 -0.793551 1.834535 1.022755 -0.200327 -1.210209 -0.192702 -0.399902
1 -0.046683 -0.815725 -0.939308 -0.717851 -0.452049 0.186818 -0.058242 -1.161620 -0.106600 -0.759465
2 0.018269 -0.010545 0.199232 -0.208134 1.622097 0.999035 -0.037887 -0.653805 -0.156216 -0.842945
3 0.072814 0.050539 0.377522 -0.474750 0.804562 1.227433 0.353355 -1.184900 0.122176 -1.127026
4 -0.096358 0.132769 0.377279 -0.579544 0.255170 1.965899 -0.168385 -1.130961 -0.193074 -1.476679
... ... ... ... ... ... ... ... ... ... ...
884 -0.271991 0.638713 0.859928 -1.021424 3.624701 -0.294023 -0.104039 -0.792128 1.047117 2.297951
885 -0.053254 -0.372993 -0.618364 -0.305875 -0.416005 -0.086637 -0.168941 -0.606457 -0.171510 -0.127752
886 0.240950 -0.518734 -0.178353 2.028391 -0.580386 -0.618868 0.102136 -0.691061 0.410261 0.039119
887 -0.313935 0.096441 -0.173509 -0.791249 -1.200227 -0.971347 -0.215859 -0.745509 -0.152849 -0.038340
888 -0.080613 0.220793 -0.012039 -0.628212 -0.086698 0.155885 -0.184178 -0.706372 -0.151776 -0.713007

889 rows × 10 columns

In [ ]:
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from scipy.cluster.hierarchy import dendrogram
from pandas.plotting import parallel_coordinates
In [ ]:
X_scaled_clustered = pd.DataFrame(sc_X, columns=sc_X.columns, index=sc_X.index)
X_scaled_clustered['cluster'] = clusters

X_scaled_clustered.head()
Out[ ]:
0 1 2 3 4 5 6 7 8 9 cluster
0 0.018614 -0.168930 -0.080984 -0.793551 1.834535 1.022755 -0.200327 -1.210209 -0.192702 -0.399902 0
1 -0.046683 -0.815725 -0.939308 -0.717851 -0.452049 0.186818 -0.058242 -1.161620 -0.106600 -0.759465 0
2 0.018269 -0.010545 0.199232 -0.208134 1.622097 0.999035 -0.037887 -0.653805 -0.156216 -0.842945 0
3 0.072814 0.050539 0.377522 -0.474750 0.804562 1.227433 0.353355 -1.184900 0.122176 -1.127026 0
4 -0.096358 0.132769 0.377279 -0.579544 0.255170 1.965899 -0.168385 -1.130961 -0.193074 -1.476679 0
In [ ]:
from sklearn.decomposition import PCA

# Create a PCA model to reduce our data to 2 dimensions for visualisation
pca = PCA(n_components=2)
pca.fit(sc_X)

# Transfor the scaled data to the new PCA space
X_reduced = pca.transform(sc_X)
In [ ]:
# Convert to a data frame
X_reduceddf = pd.DataFrame(X_reduced, index=X.index, columns=['PC1','PC2'])
X_reduceddf['cluster'] = clusters
X_reduceddf.head()
Out[ ]:
PC1 PC2 cluster
0 -0.057504 0.243946 0
1 0.430891 -1.094114 0
2 0.193360 0.337670 0
3 0.296173 0.362192 0
4 0.250216 -0.355774 0
In [ ]:
display_factorial_planes(X_reduced, 2, pca, [(0,1)], illustrative_var = clusters, alpha = 0.8)
2022-05-01T15:20:03.087743 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
# Add the cluster number to the original scaled data
X_clustered = pd.DataFrame(sc_X, index=sc_X.index, columns=sc_X.columns)
X_clustered["cluster"] = clusters

# Display parallel coordinates plots, one for each cluster
display_parallel_coordinates(X_clustered, 3)
2022-05-01T15:20:05.566603 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
# Create a data frame containing our centroids
centroids = pd.DataFrame(sc.fit_transform(kmeans.cluster_centers_), columns=X.columns)
centroids['cluster'] = centroids.index

display_parallel_coordinates_centroids(centroids, 10)
2022-05-01T15:20:07.159170 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]:
cluster_distribution = X_reduceddf["cluster"].value_counts().to_frame().reset_index()
In [ ]:
cluster_distribution
Out[ ]:
index cluster
0 0 709
1 2 170
2 1 10
In [ ]:
cluster_distribution.columns = ["cluster","number"]
In [ ]:
cluster_distribution
Out[ ]:
cluster number
0 0 709
1 2 170
2 1 10
In [ ]:
plt.pie(cluster_distribution["number"], labels = ["Cluster 0","Cluster 2","Cluster 1"], shadow=True)
Out[ ]:
([<matplotlib.patches.Wedge at 0x7fc8221529a0>,
  <matplotlib.patches.Wedge at 0x7fc8221631c0>,
  <matplotlib.patches.Wedge at 0x7fc822163910>],
 [Text(-0.8848651129616366, 0.653462877188437, 'Cluster 0'),
  Text(0.861225012024064, -0.6843182583156402, 'Cluster 2'),
  Text(1.0993132202631353, -0.0388644278832949, 'Cluster 1')])
2022-05-01T15:20:07.402937 image/svg+xml Matplotlib v3.4.3, https://matplotlib.org/
In [ ]: